This notebook builds a graph between courses, linking them through common themes identified with topic detection (LDA) on their keywords and content summaries.
In [1]:
# Useful starting lines
%matplotlib inline
import os
import numpy as np
import scipy
import scipy.sparse as sp
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import string
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, PCA, TruncatedSVD, SparsePCA
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from scipy import sparse, stats, spatial
import pickle
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
%load_ext autoreload
%autoreload 2
In [2]:
# Load the courses descriptions dataframe
courses = pd.read_pickle("../data/cleaned_courses_STI.pickle")
courses.head()
Out[2]:
In [3]:
def keywords_to_list(columns, dataframe):
    # Create a keywords dataframe per column and store them in a list
    keywords_df = list()
    # Accumulate the keywords found across all the columns
    keywords = list()
    # Enumerate along the columns: "keywords_EN, keywords_FR ..."
    for c, categ in enumerate(columns):
        # Make a copy
        df = dataframe[[categ]].copy()
        # Join the elements of each list with ";"
        df[categ] = df[categ].apply(lambda x: ";".join(x))
        # Split the dataframe on the ";"
        df = df[categ].str.split(";", expand=True)
        # Format all the strings
        for col in df.columns:
            df[col] = df[col].str.replace(r'^ +', "", regex=True)
            df[col] = df[col].str.replace(r' +$', "", regex=True)
            df[col] = df[col].str.replace(r'/|\\', " ", regex=True)
            df[col] = df[col].str.replace(r'\r', "", regex=True)
            df[col] = df[col].str.replace(r'\(|\)|:', "", regex=True)
            # Add the new keywords
            keywords += list(df[col].unique())
        # Append the cleaned dataframe
        keywords_df.append(df)
    # Get the full, de-duplicated list of keywords
    keywords = sorted(filter(None, list(set(keywords))))
    return keywords, keywords_df
def split_words(x):
    if type(x) is str:
        x = x.lower()
        # Split successively on each separator; after the first pass x becomes a list of words
        for symbol in [";", ",", ".", "\n", " - ", "- ", "_", " "]:
            if type(x) is not str:
                temp = list()
                for word in x:
                    temp += word.split(symbol)
                x = temp
            else:
                x = x.split(symbol)
        return list(filter(None, x))
    else:
        return []
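For example, split_words("Machine Learning; Signal Processing") returns ['machine', 'learning', 'signal', 'processing'].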
In [4]:
keywords_df = list()
list_col = ["KeyWords_EN", "Summary_Concepts_Contents_EN"]
language = "english"
keywords, keywords_df = keywords_to_list(list_col, courses)
print("There are {} distinct keywords in english".format(len(keywords)))
In [5]:
# Concatenate the results of the different columns to obtain one line per course
# join_axes was removed from pd.concat, so align explicitly on the index of the first dataframe
result = pd.concat(keywords_df, axis=1).reindex(keywords_df[0].index)
lda_data = list()
for row in result.values:
    try:
        interm_data = " ".join(list(set(filter(None, row))))
    except TypeError:
        # Fall back to the raw list when the row contains non-string values
        interm_data = list(set(filter(None, row)))
    lda_data.append(interm_data)
print("We have strings for {} courses".format(len(lda_data)))
result.head()
Out[5]:
In [6]:
distinct_words = lda_data.copy()
distinct_words = [split_words(x) for x in lda_data]
distinct_words_per_course = [len(list(set(list(filter(None, x))))) for x in distinct_words]
distinct_words = sum(distinct_words, [])
distinct_words = list(set(list(filter(None, distinct_words))))
med_num_distinct_words = np.median(np.array(distinct_words_per_course))
print("We have {} distinct words which make up the strings".format(len(distinct_words)))
print("Median number of distinct words per course : {}".format(med_num_distinct_words))
Functions used to perform the LDA
In [7]:
def print_top_words(model, feature_names, n_top_words):
    topics_keywords = list()
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        topics_keywords.append([feature_names[i]
                                for i in topic.argsort()[:-n_top_words - 1:-1]])
    print()
    return topics_keywords

def perform_LDA(data_samples, language, n_samples, n_features, n_components, n_top_words):
    # Extract TF-IDF features and project them with PCA before fitting the LDA
    print("Extracting TF-IDF features for LDA...")
    stop_words = stopwords.words(language)
    tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                    max_features=n_features,
                                    stop_words=stop_words)
    tfidf = tf_vectorizer.fit_transform(data_samples)
    pca = PCA()
    pca_tfidf = pca.fit_transform(tfidf.toarray())
    print("Fitting LDA model with n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=25,
                                    learning_method='batch',
                                    learning_offset=50.,
                                    random_state=0)
    # Shift the PCA projection so that all entries are non-negative, as LDA requires
    result = lda.fit_transform(pca_tfidf - np.min(pca_tfidf))
    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    topics_keywords = print_top_words(lda, tf_feature_names, n_top_words)
    return result, lda
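perform_LDA returns the fitted model together with the document-topic matrix (one topic distribution per course); the latter is used further down as the feature vectors from which the course graph is built.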
Performing LDA on the previously formatted keywords
In [8]:
n_samples = len(lda_data)
n_features = len(distinct_words)
n_components = 10
n_top_words = 20
print(n_samples, n_features)
result_lda, lda = perform_LDA(lda_data, language, n_samples, n_features, n_components, n_top_words)
In [9]:
plt.matshow(lda.components_)
plt.colorbar()
Out[9]:
In [10]:
result_lda
Out[10]:
In [11]:
feature_vectors = result_lda.copy()
feature_vectors -= feature_vectors.mean(axis=0)
feature_vectors /= feature_vectors.std(axis=0)
distances = spatial.distance.pdist(feature_vectors, metric = "cosine")
distances = spatial.distance.squareform(distances)
kernel_width = (distances.mean())
weights = np.exp(-np.power(distances, 2)/(kernel_width**2))
weights[np.diag_indices_from(weights)] = 0
plt.matshow(weights);
plt.colorbar();
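The weights follow a Gaussian kernel on the cosine distances, w_ij = exp(-d_ij^2 / sigma^2), with the kernel width sigma set to the mean pairwise distance; the diagonal is zeroed so that no course is linked to itself.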
Saving the weight matrix
In [12]:
# Make sure the output folder exists before writing the pickle
os.makedirs(os.path.join(os.getcwd(), "Graphs"), exist_ok=True)
with open(os.path.join(os.getcwd(), "Graphs", "topics_graph.pkl"), "wb") as pkl_file:
    pickle.dump(weights, pkl_file)
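As a quick sanity check, the saved matrix can be loaded back and turned into a weighted graph. A minimal sketch, assuming networkx is installed (it is not imported in this notebook) and that the index of the courses dataframe provides the node labels:

import networkx as nx

with open(os.path.join(os.getcwd(), "Graphs", "topics_graph.pkl"), "rb") as pkl_file:
    weights_loaded = pickle.load(pkl_file)

# One node per course, edge weights taken from the kernel matrix (the zero diagonal avoids self-loops)
course_graph = nx.from_numpy_array(weights_loaded)
course_graph = nx.relabel_nodes(course_graph, dict(enumerate(courses.index)))
print(course_graph.number_of_nodes(), "nodes,", course_graph.number_of_edges(), "edges")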